{
 "cells": [
  {
   "source": [
    "# Action1使用Wide&Deep模型对movielens进行评分预测\n",
    "1、完成代码（20points）     2、使用Wide&Deep工具进行评分预测，代码正确（20points）"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 训练过程"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from deepctr.feature_column import SparseFeat,get_feature_names\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics\n",
    "# import warnings\n",
    "# warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "\n",
    "data = pd.read_csv(\"ratings.csv\")\n",
    "sparse_features = [\"userId\", \"movieId\", 'timestamp']\n",
    "target = ['rating']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "         userId  movieId  rating   timestamp\n0             1        2     3.5  1112486027\n1             1       29     3.5  1112484676\n2             1       32     3.5  1112484819\n3             1       47     3.5  1112484727\n4             1       50     3.5  1112484580\n...         ...      ...     ...         ...\n1048570    7120      168     5.0  1175543061\n1048571    7120      253     4.0  1175542225\n1048572    7120      260     5.0  1175542035\n1048573    7120      261     4.0  1175543376\n1048574    7120      266     3.5  1175542454\n\n[1048575 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>rating</th>\n      <th>timestamp</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>2</td>\n      <td>3.5</td>\n      <td>1112486027</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>29</td>\n      <td>3.5</td>\n      <td>1112484676</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>1</td>\n      <td>32</td>\n      <td>3.5</td>\n      <td>1112484819</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>1</td>\n      <td>47</td>\n      <td>3.5</td>\n      <td>1112484727</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>1</td>\n      <td>50</td>\n      <td>3.5</td>\n      <td>1112484580</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1048570</th>\n      <td>7120</td>\n      <td>168</td>\n      <td>5.0</td>\n      <td>1175543061</td>\n    </tr>\n    <tr>\n      <th>1048571</th>\n      <td>7120</td>\n      <td>253</td>\n      <td>4.0</td>\n      <td>1175542225</td>\n    </tr>\n    <tr>\n      <th>1048572</th>\n      <td>7120</td>\n      <td>260</td>\n      <td>5.0</td>\n      <td>1175542035</td>\n    </tr>\n    <tr>\n      <th>1048573</th>\n      <td>7120</td>\n      <td>261</td>\n      <td>4.0</td>\n      <td>1175543376</td>\n    </tr>\n    <tr>\n      <th>1048574</th>\n      <td>7120</td>\n      <td>266</td>\n      <td>3.5</td>\n      <td>1175542454</td>\n    </tr>\n  </tbody>\n</table>\n<p>1048575 rows × 4 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对特征标签进行编码\n",
    "for feature in sparse_features:\n",
    "    lbe = LabelEncoder()\n",
    "    data[feature] = lbe.fit_transform(data[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "         userId  movieId  rating  timestamp\n0             0        1     3.5     340880\n1             0       28     3.5     340785\n2             0       31     3.5     340801\n3             0       46     3.5     340790\n4             0       49     3.5     340774\n...         ...      ...     ...        ...\n1048570    7119      163     5.0     494540\n1048571    7119      247     4.0     494524\n1048572    7119      254     5.0     494519\n1048573    7119      255     4.0     494545\n1048574    7119      260     3.5     494530\n\n[1048575 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>userId</th>\n      <th>movieId</th>\n      <th>rating</th>\n      <th>timestamp</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>1</td>\n      <td>3.5</td>\n      <td>340880</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0</td>\n      <td>28</td>\n      <td>3.5</td>\n      <td>340785</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0</td>\n      <td>31</td>\n      <td>3.5</td>\n      <td>340801</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0</td>\n      <td>46</td>\n      <td>3.5</td>\n      <td>340790</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0</td>\n      <td>49</td>\n      <td>3.5</td>\n      <td>340774</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1048570</th>\n      <td>7119</td>\n      <td>163</td>\n      <td>5.0</td>\n      <td>494540</td>\n    </tr>\n    <tr>\n      <th>1048571</th>\n      <td>7119</td>\n      <td>247</td>\n      <td>4.0</td>\n      <td>494524</td>\n    </tr>\n    <tr>\n      <th>1048572</th>\n      <td>7119</td>\n      <td>254</td>\n      <td>5.0</td>\n      <td>494519</td>\n    </tr>\n    <tr>\n      <th>1048573</th>\n      <td>7119</td>\n      <td>255</td>\n      <td>4.0</td>\n      <td>494545</td>\n    </tr>\n    <tr>\n      <th>1048574</th>\n      <td>7119</td>\n      <td>260</td>\n      <td>3.5</td>\n      <td>494530</td>\n    </tr>\n  </tbody>\n</table>\n<p>1048575 rows × 4 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "SparseFeat(name='userId', vocabulary_size=7120, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002B7000AC280>, embedding_name='userId', group_name='default_group', trainable=True)"
     },
     "metadata": {},
     "execution_count": 13
    }
   ],
   "source": [
    "SparseFeat('userId', data['userId'].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 计算每个特征中的 不同特征值的个数\n",
    "fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]\n",
    "linear_feature_columns = fixlen_feature_columns\n",
    "dnn_feature_columns = fixlen_feature_columns\n",
    "feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "[SparseFeat(name='userId', vocabulary_size=7120, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002B701D09D00>, embedding_name='userId', group_name='default_group', trainable=True),\n SparseFeat(name='movieId', vocabulary_size=14026, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002B7026A1970>, embedding_name='movieId', group_name='default_group', trainable=True),\n SparseFeat(name='timestamp', vocabulary_size=822889, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002B7026A1D90>, embedding_name='timestamp', group_name='default_group', trainable=True)]"
     },
     "metadata": {},
     "execution_count": 20
    }
   ],
   "source": [
    "linear_feature_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([   0,    0,    0, ..., 7119, 7119, 7119], dtype=int64)"
     },
     "metadata": {},
     "execution_count": 22
    }
   ],
   "source": [
    "data.userId.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "{'userId': array([3028, 5661,  855, ..., 6098, 3997,   28], dtype=int64),\n 'movieId': array([5903,  330, 2269, ..., 9210,  628,   91], dtype=int64),\n 'timestamp': array([794577,  15062, 231068, ..., 525539,  11137,  11402], dtype=int64)}"
     },
     "metadata": {},
     "execution_count": 24
    }
   ],
   "source": [
    "train_model_input"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将数据集切分成训练集和测试集\n",
    "train, test = train_test_split(data, test_size=0.2)\n",
    "train_model_input = {name:train[name].values for name in feature_names}\n",
    "test_model_input = {name:test[name].values for name in feature_names}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "838860"
     },
     "metadata": {},
     "execution_count": 33
    }
   ],
   "source": [
    "len(train['timestamp'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/10\n",
      "2622/2622 [==============================] - 259s 99ms/step - loss: 0.9409 - mse: 0.9346 - val_loss: 0.7499 - val_mse: 0.7367\n",
      "Epoch 2/10\n",
      "2622/2622 [==============================] - 149s 57ms/step - loss: 0.3893 - mse: 0.3698 - val_loss: 0.7917 - val_mse: 0.7653\n",
      "Epoch 3/10\n",
      "2622/2622 [==============================] - 116s 44ms/step - loss: 0.4004 - mse: 0.3695 - val_loss: 0.8331 - val_mse: 0.7972\n",
      "Epoch 4/10\n",
      "2622/2622 [==============================] - 111s 43ms/step - loss: 0.3007 - mse: 0.2647 - val_loss: 0.8258 - val_mse: 0.7887\n",
      "Epoch 5/10\n",
      "2622/2622 [==============================] - 104s 39ms/step - loss: 0.2436 - mse: 0.2075 - val_loss: 0.8251 - val_mse: 0.7886\n",
      "Epoch 6/10\n",
      "2622/2622 [==============================] - 102s 39ms/step - loss: 0.2255 - mse: 0.1899 - val_loss: 0.8326 - val_mse: 0.7960\n",
      "Epoch 7/10\n",
      "2622/2622 [==============================] - 100s 38ms/step - loss: 0.2109 - mse: 0.1745 - val_loss: 0.8491 - val_mse: 0.8112\n",
      "Epoch 8/10\n",
      "2622/2622 [==============================] - 109s 42ms/step - loss: 0.2092 - mse: 0.1720 - val_loss: 0.8663 - val_mse: 0.8281\n",
      "Epoch 9/10\n",
      "2622/2622 [==============================] - 112s 43ms/step - loss: 0.2135 - mse: 0.1759 - val_loss: 0.8584 - val_mse: 0.8195\n",
      "Epoch 10/10\n",
      "2622/2622 [==============================] - 109s 41ms/step - loss: 0.1917 - mse: 0.1540 - val_loss: 0.8586 - val_mse: 0.8201\n"
     ]
    }
   ],
   "source": [
    "# 10epoch使用WDL进行训练\n",
    "from deepctr.models import WDL\n",
    "model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')\n",
    "model.compile(\"adam\", \"mse\", metrics=['mse'], )\n",
    "history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=True, validation_split=0.2, )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2622/2622 [==============================] - 117s 45ms/step - loss: 0.9451 - mse: 0.9387 - val_loss: 0.7505 - val_mse: 0.7372\n"
     ]
    }
   ],
   "source": [
    "# 1 epoch训练\n",
    "from deepctr.models import WDL\n",
    "model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')\n",
    "model.compile(\"adam\", \"mse\", metrics=['mse'], )\n",
    "history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[3.0677881],\n",
       "       [2.101383 ],\n",
       "       [4.28821  ],\n",
       "       ...,\n",
       "       [4.1646166],\n",
       "       [4.200996 ],\n",
       "       [3.806195 ]], dtype=float32)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 使用WDL进行预测\n",
    "pred_ans = model.predict(test_model_input, batch_size=256)\n",
    "pred_ans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test RMSE 0.8581375181169975\n"
     ]
    }
   ],
   "source": [
    "mse = round(mean_squared_error(test[target].values, pred_ans), 4)\n",
    "rmse = mse ** 0.5\n",
    "print(\"test RMSE\", rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test RMSE 0.9059249417032296\n"
     ]
    }
   ],
   "source": [
    "# 10epoch输出RMSE\n",
    "mse = round(mean_squared_error(test[target].values, pred_ans), 4)\n",
    "rmse = mse ** 0.5\n",
    "print(\"test RMSE\", rmse)"
   ]
  },
  {
   "source": [
    "## NFM"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "2622/2622 [==============================] - 155s 59ms/step - loss: 1.0215 - mse: 1.0142 - val_loss: 0.7627 - val_mse: 0.7465\ntest RMSE 0.8652745229116595\n运行时间为： 158.18689227104187\n"
    }
   ],
   "source": [
    "# 使用NFM进行训练\n",
    "from deepctr.models import NFM\n",
    "import time\n",
    "time1 = time.time()\n",
    "model = NFM(linear_feature_columns, dnn_feature_columns, task='regression')\n",
    "model.compile(\"adam\", \"mse\", metrics=['mse'], )\n",
    "history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )\n",
    "# 使用NFM进行预测\n",
    "pred_ans = model.predict(test_model_input, batch_size=256)\n",
    "# 输出RMSE或MSE\n",
    "mse = round(mean_squared_error(test[target].values, pred_ans), 4)\n",
    "rmse = mse ** 0.5\n",
    "print(\"test RMSE\", rmse)\n",
    "time2 = time.time()\n",
    "print('运行时间为：',time2-time1)"
   ]
  },
  {
   "source": [
    "## DeepFM"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "2622/2622 [==============================] - 146s 56ms/step - loss: 0.9358 - mse: 0.9293 - val_loss: 0.7522 - val_mse: 0.7389\ntest RMSE 0.8623804264940155\n运行时间为： 149.58299779891968\n"
    }
   ],
   "source": [
    "# 使用deepfm进行训练\n",
    "from deepctr.models import DeepFM\n",
    "import time\n",
    "time1 = time.time()\n",
    "model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')\n",
    "model.compile(\"adam\", \"mse\", metrics=['mse'], )\n",
    "history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )\n",
    "# 使用NFM进行预测\n",
    "pred_ans = model.predict(test_model_input, batch_size=256)\n",
    "# 输出RMSE或MSE\n",
    "mse = round(mean_squared_error(test[target].values, pred_ans), 4)\n",
    "rmse = mse ** 0.5\n",
    "print(\"test RMSE\", rmse)\n",
    "time2 = time.time()\n",
    "print('运行时间为：',time2-time1)"
   ]
  },
  {
   "source": [
    "## WDL"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "2622/2622 [==============================] - 141s 54ms/step - loss: 0.9379 - mse: 0.9317 - val_loss: 0.7539 - val_mse: 0.7409\ntest RMSE 0.8628441342444185\n运行时间为： 143.85020852088928\n"
    }
   ],
   "source": [
    "# 使用WDL进行训练\n",
    "from deepctr.models import WDL\n",
    "import time\n",
    "time1 = time.time()\n",
    "model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')\n",
    "model.compile(\"adam\", \"mse\", metrics=['mse'], )\n",
    "history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )\n",
    "# 使用NFM进行预测\n",
    "pred_ans = model.predict(test_model_input, batch_size=256)\n",
    "# 输出RMSE或MSE\n",
    "mse = round(mean_squared_error(test[target].values, pred_ans), 4)\n",
    "rmse = mse ** 0.5\n",
    "print(\"test RMSE\", rmse)\n",
    "time2 = time.time()\n",
    "print('运行时间为：',time2-time1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.0 64-bit",
   "language": "python",
   "name": "python_defaultSpec_1600740416700"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}